# Clear environment of variables and functions
rm(list = ls(all = TRUE))
# Clear environmet of packages
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)
if (!require("ChannelAttribution")) install.packages("ChannelAttribution")
if (!require("plotly")) install.packages("plotly")
if (!require("jsonlite")) install.packages("jsonlite")
if (!require("reshape")) install.packages("reshape")
library(jsonlite)
library(plotly)
library(readr)
library(tidyverse)
library(ChannelAttribution)
library(ggplot2)
library(reshape)
#Sales call Dataframes
PhoneCalls20161001_20170228 <- read_csv("PhoneCalls20161001_20170228.csv") #From 2016-10-01 to 2017-02-28
PhoneCalls20170301_20170630 <- read_csv("PhoneCalls20170301_20170630.csv") #From 2017-03-01 to 2017-06-30
#Agent acquisition date by ZUID
agent_acquisition_date <- read_csv("AgentAcquisitionDates.csv")
#AgentID to ZUID mapping
agentid_zuid <- read_csv("AgentIDZUIDLookup.csv")
#Lead table
agent_lead <- read_csv("AgentLeads.csv")
#Sales meeting table
sales_meeting <- read_csv("SalesMeetings.csv")
#Union 2 Call Dataframes. Output: Call Dataframe from 2016-10-01 to 2017-06-30
phone_call <- bind_rows(PhoneCalls20161001_20170228, PhoneCalls20170301_20170630)
#Function returning the percentage of missing values for each column.
function_count_na <- function(df){
percent_missing <-sapply(df, function(y) round(100*sum(length(which(is.na(y))))/sum(length(y)),2))
percent_missing <- data.frame(percent_missing)
percent_missing
}
function_count_na(phone_call) #REAgentID missing 69.6%, SalesRepID missing 0.5%, 3 other variables no missing data
## percent_missing
## REAgentID 69.56
## SalesRepID 0.54
## PhoneCallType 0.00
## PhoneCallDateTime 0.00
## TalkTimeMinutes 0.00
function_count_na(agent_lead)
## percent_missing
## REAgentID 5.68
## SalesRepID 14.87
## LeadPlatform 0.00
## LeadType 24.52
## LeadVendor 68.18
## LeadDateTime 0.00
function_count_na(sales_meeting) #no missing data
## percent_missing
## REAgentID 0
## SalesRepID 0
## SalesMeetingDate 0
function_count_na(agentid_zuid) #no missing data
## percent_missing
## REAgentID 0
## ZUID 0
function_count_na(agent_acquisition_date) #Zuid missing 0%, AcquisitionDate missing 14.5%
## percent_missing
## ZUID 0.00
## AcquisitionDate 14.54
#Check date range of agent_lead
max(agent_lead$LeadDateTime) #"2017-06-29 23:30:28 UTC"
## [1] "2017-06-29 23:30:28 UTC"
min(agent_lead$LeadDateTime) #"2016-10-01 00:00:49 UTC"
## [1] "2016-10-01 00:00:49 UTC"
#Check date range of agent_acquisition_date
max(na.omit(agent_acquisition_date$AcquisitionDate)) #"2017-07-05 UTC"
## [1] "2017-07-05 UTC"
min(na.omit(agent_acquisition_date$AcquisitionDate)) #"2009-01-01 UTC"
## [1] "2009-01-01 UTC"
#Check date range of sales_meeting
max(sales_meeting$SalesMeetingDate) #"2017-06-30 UTC"
## [1] "2017-06-30 UTC"
min(sales_meeting$SalesMeetingDate) #"2017-03-13 UTC"
## [1] "2017-03-13 UTC"
Observation
Lead table: * From Oct 2017 to June 2017
Acquisition table: * From Jan 2009 to July 2017 * Solution: Delete data before Oct 2017
Sales Meeting table: * Only from March 2017 to June 2017 * Comment: Limited information, might not be information for data before march 2017
#Filter agent_acquisition_date before 2016-10-01
agent_acquisition_date <- subset(agent_acquisition_date, AcquisitionDate >= "2016-10-01")
Result: 18,086 rows (originally 161,006 rows )
#Search for all unique lead vendors
agent_lead %>% group_by(agent_lead$LeadType) %>% distinct(LeadVendor)
## # A tibble: 88 x 2
## # Groups: agent_lead$LeadType [6]
## LeadVendor `agent_lead$LeadType`
## <chr> <chr>
## 1 <NA> <NA>
## 2 email Email
## 3 Affiliate Paid
## 4 <NA> Organic
## 5 Facebook Paid
## 6 facebook Paid
## 7 Gmail Paid
## 8 Google Search Paid
## 9 Gdn Paid
## 10 email_welcome Email
## # … with 78 more rows
#Clean up marketing channel assignments
agent_lead <-agent_lead %>% mutate(LeadType=replace(LeadType, LeadType== "Social Organic", "Organic"),
LeadVendor=replace(LeadVendor, LeadType == "Organic", "Organic"),
LeadVendor=replace(LeadVendor, LeadVendor == "organic", "Organic"),
LeadVendor=replace(LeadVendor, LeadType == "Organic", "Organic"),
LeadVendor=replace(LeadVendor, LeadType == "unknown", "Unspecified"),
LeadVendor=replace(LeadVendor, LeadVendor == "unknown", "Unspecified"),
LeadVendor=replace(LeadVendor, LeadVendor== "Gdn", "Google Search"),
LeadVendor=replace(LeadVendor, LeadVendor== "GDN", "Google Search"),
LeadVendor=replace(LeadVendor, LeadVendor== "Affiliate", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "Linkedin Ads", "Linkedin"),
LeadVendor=replace(LeadVendor, LeadVendor== "Linkedin Display", "Linkedin"),
LeadVendor=replace(LeadVendor, LeadVendor== "Great_Schools", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "Trulia", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "gemini", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "internal", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "IronTraffic", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "4197532", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "brandnetworks", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "MSN_Real_Estate", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "Unspecified" & LeadType == "Paid", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "http://www.2020propertygroup.com/monthly-payment-calculator/", "Paid Other"),
LeadVendor=replace(LeadVendor, is.na(LeadVendor) & LeadType == "Paid", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "AreaVibes", "Paid Other"),
LeadType=replace(LeadType, LeadVendor == "Organic" & LeadType == "Paid", "Paid Other"),
LeadVendor=replace(LeadVendor, LeadVendor== "facebook" | LeadVendor== "Facebook" |
LeadVendor== "instagram" |LeadVendor== "Instagram"|
LeadVendor == "Twitter" |LeadVendor== "Linkedin", "Paid Social"),
LeadVendor=replace(LeadVendor, LeadVendor== "gmail"|LeadVendor == "Gmail" |
LeadVendor == "Outlook" | LeadVendor == "Yahoo", "Email"),
LeadType=replace(LeadType, LeadVendor== "Email", "Email"),
LeadVendor = replace(LeadVendor, LeadType == "Email", "Email"),
LeadVendor = replace(LeadVendor, is.na(LeadVendor), "Unspecified"),
LeadType = replace(LeadType, is.na(LeadType), 'Unspecified'))
#Remove Unspecified non paid channels.
#agent_lead <- subset(agent_lead, LeadVendor != "Unspecified")
agent_lead
## # A tibble: 271,890 x 6
## REAgentID SalesRepID LeadPlatform LeadType LeadVendor
## <chr> <chr> <chr> <chr> <chr>
## 1 00340000… <NA> Web Unspeci… Unspecifi…
## 2 00333000… 005330000… Web Unspeci… Unspecifi…
## 3 00333000… 005330000… Web Email Email
## 4 00340000… <NA> Web Unspeci… Unspecifi…
## 5 00340000… <NA> Web Unspeci… Unspecifi…
## 6 00333000… 005400000… Web Unspeci… Unspecifi…
## 7 00333000… 005330000… Web Unspeci… Unspecifi…
## 8 00340000… <NA> Web Unspeci… Unspecifi…
## 9 00340000… <NA> Web Unspeci… Unspecifi…
## 10 00340000… <NA> Web Unspeci… Unspecifi…
## # … with 271,880 more rows, and 1 more variable: LeadDateTime <dttm>
#Create a Marketing Channel aggregate dataframe
Channel_Data <- agent_lead %>%
group_by(LeadVendor) %>%
summarise(Total = n())
Channel_Data$Cost <- c(80, 10, 100, 0, 110, 150, 0)
Channel_Data$Total_Cost <- Channel_Data$Total*Channel_Data$Cost
Channel_Data$NPV <- c(7000, 8000, 7000, 7500, 7000, 7000, 7500)
#Create a Sales Channel calls aggregate dataframe
Channel_Calls <- phone_call %>%
filter(TalkTimeMinutes >= 0.5) %>%
group_by(PhoneCallType) %>%
summarise(Total = n())
Channel_Calls <- Channel_Calls %>% dplyr::rename(LeadVendor = PhoneCallType)
Channel_Calls$Cost <- c(10, 10)
Channel_Calls$Total_Cost <- Channel_Calls$Total*Channel_Calls$Cost
Channel_Calls$NPV <- c(7500, 7500)
#Create a Sales Channel meetings aggregate dataframe
Channel_Meet <- sales_meeting %>%
summarise(Total = n())
Channel_Meet$Cost <- c(300)
Channel_Meet$Total_Cost <- Channel_Meet$Total*Channel_Meet$Cost
Channel_Meet$NPV <- c(7500)
Channel_Meet$LeadVendor <- "Meeting"
#Bind into an aggregate channels dataframe that will be useful later.
Channel_Data <- bind_rows(Channel_Data, Channel_Calls, Channel_Meet)
Channel_Data <- Channel_Data %>%
arrange(LeadVendor) %>%
dplyr::rename(channel_name = LeadVendor)
Channel_Data
## # A tibble: 10 x 5
## channel_name Total Cost Total_Cost NPV
## <chr> <int> <dbl> <dbl> <dbl>
## 1 Bing Search 2489 80 199120 7000
## 2 Email 18403 10 184030 8000
## 3 Google Search 16112 100 1611200 7000
## 4 Meeting 1003 300 300900 7500
## 5 Organic 148132 0 0 7500
## 6 Paid Other 7115 110 782650 7000
## 7 Paid Social 12806 150 1920900 7000
## 8 RE Agent Called Sales Rep 4883311 10 48833110 7500
## 9 Sales Rep Called RE Agent 2411389 10 24113890 7500
## 10 Unspecified 66833 0 0 7500
#Left join agentid_zuid AND agent_acquisition_date ON zuid
agentid_zuid_acquisitionDate <- left_join(agentid_zuid, agent_acquisition_date, by = "ZUID")
#Remove all ids without an acquisition date
agentid_zuid_acquisitionDate <- subset(agentid_zuid_acquisitionDate, !is.na(AcquisitionDate || !is.na(REAgentID)))
#Joing meeting with acquisition date
meeting_with_zuid <- sales_meeting %>% left_join(agentid_zuid_acquisitionDate, by = "REAgentID")
meeting_with_zuid <- meeting_with_zuid %>% filter(AcquisitionDate >= SalesMeetingDate | is.na(AcquisitionDate))
meeting_with_zuid
## # A tibble: 878 x 5
## REAgentID SalesRepID SalesMeetingDate ZUID AcquisitionDate
## <chr> <chr> <dttm> <dbl> <dttm>
## 1 0033300001f… 00533000002… 2017-03-13 00:00:00 3.81e6 NA
## 2 0034000001A… 00533000002… 2017-03-13 00:00:00 3.71e7 NA
## 3 0034000001A… 00540000001… 2017-03-21 00:00:00 1.30e7 NA
## 4 0034000001Q… 00540000001… 2017-03-21 00:00:00 2.09e7 NA
## 5 0034000001B… 00540000001… 2017-03-21 00:00:00 2.74e7 NA
## 6 0033300001l… 00540000001… 2017-03-21 00:00:00 7.03e6 NA
## 7 0034000000k… 00540000001… 2017-03-21 00:00:00 3.75e6 NA
## 8 0034000000g… 00540000001… 2017-03-21 00:00:00 2.24e6 NA
## 9 0034000000Q… 00540000001… 2017-03-22 00:00:00 1.31e6 NA
## 10 0033300001p… 00540000001… 2017-03-22 00:00:00 6.58e7 NA
## # … with 868 more rows
#Filter call data to calls over 2 minutes that have agent ids
phone_call<-subset(phone_call, phone_call$TalkTimeMinutes >= 2 & !is.na(REAgentID))
#Joing meeting with acquisition date
phoneCall_with_zuid <- phone_call %>% left_join(agentid_zuid_acquisitionDate, by = "REAgentID")
phoneCall_with_zuid <- subset(phoneCall_with_zuid, PhoneCallDateTime < AcquisitionDate | is.na(AcquisitionDate) | !is.na(REAgentID))
phoneCall_with_zuid
## # A tibble: 531,956 x 7
## REAgentID SalesRepID PhoneCallType PhoneCallDateTime TalkTimeMinutes
## <chr> <chr> <chr> <dttm> <dbl>
## 1 00333000… 005330000… Sales Rep Ca… 2016-10-01 08:52:21 4.2
## 2 00340000… 005330000… RE Agent Cal… 2016-10-01 09:28:21 8.5
## 3 00340000… 005330000… RE Agent Cal… 2016-10-01 10:14:55 3.3
## 4 00340000… 005330000… Sales Rep Ca… 2016-10-01 10:28:51 11.9
## 5 00340000… 005330000… Sales Rep Ca… 2016-10-01 10:50:57 2.1
## 6 00340000… 005330000… RE Agent Cal… 2016-10-01 11:01:17 2.2
## 7 00340000… 005330000… Sales Rep Ca… 2016-10-01 11:32:49 2.1
## 8 00340000… 005330000… Sales Rep Ca… 2016-10-01 11:42:17 2.4
## 9 00333000… 005330000… Sales Rep Ca… 2016-10-01 11:58:27 2.2
## 10 00340000… 005400000… Sales Rep Ca… 2016-10-01 12:43:06 3.8
## # … with 531,946 more rows, and 2 more variables: ZUID <dbl>,
## # AcquisitionDate <dttm>
#Joing leads with acquisition date
lead_with_zuid <- left_join(agent_lead, agentid_zuid_acquisitionDate, by = "REAgentID")
lead_with_zuid <- subset(lead_with_zuid, LeadDateTime < AcquisitionDate || is.na(AcquisitionDate) || !is.na(REAgentID))
lead_with_zuid
## # A tibble: 271,890 x 8
## REAgentID SalesRepID LeadPlatform LeadType LeadVendor
## <chr> <chr> <chr> <chr> <chr>
## 1 00340000… <NA> Web Unspeci… Unspecifi…
## 2 00333000… 005330000… Web Unspeci… Unspecifi…
## 3 00333000… 005330000… Web Email Email
## 4 00340000… <NA> Web Unspeci… Unspecifi…
## 5 00340000… <NA> Web Unspeci… Unspecifi…
## 6 00333000… 005400000… Web Unspeci… Unspecifi…
## 7 00333000… 005330000… Web Unspeci… Unspecifi…
## 8 00340000… <NA> Web Unspeci… Unspecifi…
## 9 00340000… <NA> Web Unspeci… Unspecifi…
## 10 00340000… <NA> Web Unspeci… Unspecifi…
## # … with 271,880 more rows, and 3 more variables: LeadDateTime <dttm>,
## # ZUID <dbl>, AcquisitionDate <dttm>
#Creating dataframes that can be bound together
meeting <- meeting_with_zuid %>%
select(REAgentID, SalesMeetingDate, AcquisitionDate) %>%
dplyr::rename(Date = SalesMeetingDate)
phone <- phoneCall_with_zuid %>%
select(REAgentID, PhoneCallType, PhoneCallDateTime, AcquisitionDate) %>%
dplyr::rename(Channel = PhoneCallType, Date = PhoneCallDateTime)
leads <- lead_with_zuid %>%
select(REAgentID, LeadVendor, LeadDateTime, AcquisitionDate) %>%
dplyr::rename(Channel = LeadVendor, Date = LeadDateTime)
master <- bind_rows(leads, phone, meeting)
#Replace NA in Channel column
master$Channel[is.na(master$Channel)] <- "Meeting"
master <- master %>% filter(!is.na(REAgentID)) %>%
mutate(Conversion = ifelse(is.na(AcquisitionDate), 0, 1))
master
## # A tibble: 789,273 x 5
## REAgentID Channel Date AcquisitionDate Conversion
## <chr> <chr> <dttm> <dttm> <dbl>
## 1 0034000001S… Unspeci… 2016-10-01 00:00:49 NA 0
## 2 0033300001p… Unspeci… 2016-10-01 00:02:06 NA 0
## 3 0033300001p… Email 2016-10-01 00:02:56 NA 0
## 4 0034000001S… Unspeci… 2016-10-01 00:05:39 NA 0
## 5 0034000001S… Unspeci… 2016-10-01 00:08:12 NA 0
## 6 0033300001o… Unspeci… 2016-10-01 00:10:59 NA 0
## 7 0033300001h… Unspeci… 2016-10-01 00:14:00 NA 0
## 8 0034000001S… Unspeci… 2016-10-01 00:14:32 NA 0
## 9 0034000001S… Unspeci… 2016-10-01 00:18:20 NA 0
## 10 0034000001S… Unspeci… 2016-10-01 00:20:27 NA 0
## # … with 789,263 more rows
#Add the average touches and average time
stats <- master %>%
filter(Conversion == 1) %>%
arrange(REAgentID, Date) %>%
group_by(REAgentID) %>%
summarise(Touch = n(), Days = (max(Date) - min(Date)))
stats %>% summarise(Average_Touch = mean(Touch),
Max_Touch = max(Touch),
Average_Days = mean(Days),
Max_Days = max(Days))
## # A tibble: 1 x 4
## Average_Touch Max_Touch Average_Days Max_Days
## <dbl> <int> <drtn> <drtn>
## 1 5.26 46 85.0919 days 269.6749 days
#Distribution plot for days to conversion.
ggplot(stats %>% filter(Days > 0), aes(x = Days)) +
theme_minimal() +
geom_histogram(fill = '#4e79a7', binwidth = 7)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
#Distribution plot for touches to conversion.
ggplot(stats, aes(x = Touch)) +
theme_minimal() +
geom_histogram(fill = '#4e79a7', binwidth = 1)
# aggregating channels to the paths for each customer
paths <- master %>%
filter(!is.na(REAgentID)) %>%
arrange(REAgentID, Date) %>%
group_by(REAgentID) %>%
summarise(path = paste(Channel, collapse = ' > '),
conv = mean(Conversion)) %>%
ungroup()
paths
## # A tibble: 298,342 x 3
## REAgentID path conv
## <chr> <chr> <dbl>
## 1 0033000000Ea22X… Sales Rep Called RE Agent > Sales Rep Called RE … 0
## 2 0033000000EOCWG… Email 0
## 3 0033000000EqCu4… Unspecified > Unspecified > Unspecified > Unspec… 0
## 4 0033000000EqCv6… Unspecified 0
## 5 0033000000EqCw2… Sales Rep Called RE Agent > Sales Rep Called RE … 0
## 6 0033000000EqD1S… Organic > Sales Rep Called RE Agent > Sales Rep … 1
## 7 0033000000EqD1Y… Sales Rep Called RE Agent > RE Agent Called Sale… 0
## 8 0033000000FhHV9… Sales Rep Called RE Agent 0
## 9 0033000000GBnTr… Sales Rep Called RE Agent 0
## 10 0033000000GBrMA… RE Agent Called Sales Rep > Sales Rep Called RE … 0
## # … with 298,332 more rows
#Calculating the Markov model
markov <- markov_model(paths,
var_path = 'path',
var_conv = 'conv',
out_more = TRUE)
#Calculating the other models
h_mod <- heuristic_models(paths,
var_path = 'path',
var_conv = 'conv')
#Merges the two data frames on the "channel_name" column.
results <- merge(h_mod, markov$result, by='channel_name')
#Rename the columns
colnames(results) <- c('channel_name', 'first_touch', 'last_touch', 'linear_touch', 'markov_model')
results
## channel_name first_touch last_touch linear_touch
## 1 Bing Search 196 22 72.233785
## 2 Email 858 227 414.892935
## 3 Google Search 1300 135 439.074995
## 4 Meeting 4 3 3.916667
## 5 Organic 6787 2115 3650.616441
## 6 Paid Other 380 42 135.014880
## 7 Paid Social 684 65 210.588133
## 8 RE Agent Called Sales Rep 1762 3686 3075.819770
## 9 Sales Rep Called RE Agent 4088 9642 7996.318503
## 10 Unspecified 376 498 436.523892
## markov_model
## 1 103.421530
## 2 680.804275
## 3 690.893664
## 4 3.035995
## 5 3996.760609
## 6 199.994420
## 7 346.830681
## 8 3925.188787
## 9 5748.776963
## 10 739.293076
#Transforms the dataset into a data frame that ggplot2 can use to graph the outcomes
results_graph <- melt(results, id='channel_name')
# Plot the total conversions
ggplot(results_graph, aes(x = reorder(channel_name, value), y = value, fill =variable)) +
geom_bar(stat='identity', position='dodge') +
ggtitle('Total Conversions') +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
coord_flip() +
theme_classic() +
scale_fill_manual(values = c("#08519c","#3182bd","#6baed6", "#bdd7e7")) +
theme(axis.text = element_text(face = "bold", size = 10),
axis.title = element_blank(),
axis.ticks.x = element_blank(),
legend.title = element_text(face = "bold"),
legend.position = c(0.9, 0.2),
plot.title = element_text(hjust=0.5, face ="bold"),
plot.subtitle = element_text(hjust = 0.5))
#Listing the results for the last touch attribution for smarketing
results %>% select(channel_name, last_touch) %>%
mutate(conversion_rate = last_touch/Channel_Data$Total) %>%
mutate(CPA = Channel_Data$Total_Cost/last_touch) %>%
mutate(ROI = ((Channel_Data$NPV*last_touch) / Channel_Data$Total_Cost))
## channel_name last_touch conversion_rate CPA
## 1 Bing Search 22 0.0088388911 9050.9091
## 2 Email 227 0.0123349454 810.7048
## 3 Google Search 135 0.0083788481 11934.8148
## 4 Meeting 3 0.0029910269 100300.0000
## 5 Organic 2115 0.0142778063 0.0000
## 6 Paid Other 42 0.0059030218 18634.5238
## 7 Paid Social 65 0.0050757457 29552.3077
## 8 RE Agent Called Sales Rep 3686 0.0007548157 13248.2664
## 9 Sales Rep Called RE Agent 9642 0.0039985253 2500.9220
## 10 Unspecified 498 0.0074514087 0.0000
## ROI
## 1 0.77340297
## 2 9.86795631
## 3 0.58651936
## 4 0.07477567
## 5 Inf
## 6 0.37564684
## 7 0.23686813
## 8 0.56611180
## 9 2.99889400
## 10 Inf
#Listing the results for the first touch attribution for smarketing
results %>% select(channel_name, first_touch) %>%
mutate(conversion_rate = first_touch/Channel_Data$Total) %>%
mutate(CPA = Channel_Data$Total_Cost/first_touch) %>%
mutate(ROI = ((Channel_Data$NPV*first_touch) / Channel_Data$Total_Cost))
## channel_name first_touch conversion_rate CPA
## 1 Bing Search 196 0.0787464845 1015.9184
## 2 Email 858 0.0466228332 214.4872
## 3 Google Search 1300 0.0806852036 1239.3846
## 4 Meeting 4 0.0039880359 75225.0000
## 5 Organic 6787 0.0458172441 0.0000
## 6 Paid Other 380 0.0534082923 2059.6053
## 7 Paid Social 684 0.0534124629 2808.3333
## 8 RE Agent Called Sales Rep 1762 0.0003608208 27714.5914
## 9 Sales Rep Called RE Agent 4088 0.0016952885 5898.7011
## 10 Unspecified 376 0.0056259632 0.0000
## ROI
## 1 6.8903174
## 2 37.2982666
## 3 5.6479643
## 4 0.0997009
## 5 Inf
## 6 3.3987095
## 7 2.4925816
## 8 0.2706156
## 9 1.2714664
## 10 Inf
#Listing the results for the linear attribution model for smarketing
results %>% select(channel_name, linear_touch) %>%
mutate(conversion_rate = linear_touch/Channel_Data$Total) %>%
mutate(CPA = Channel_Data$Total_Cost/linear_touch) %>%
mutate(ROI = ((Channel_Data$NPV*linear_touch) / Channel_Data$Total_Cost))
## channel_name linear_touch conversion_rate CPA
## 1 Bing Search 72.233785 0.0290212072 2756.6048
## 2 Email 414.892935 0.0225448533 443.5602
## 3 Google Search 439.074995 0.0272514272 3669.5326
## 4 Meeting 3.916667 0.0039049518 76825.5319
## 5 Organic 3650.616441 0.0246443472 0.0000
## 6 Paid Other 135.014880 0.0189760899 5796.7685
## 7 Paid Social 210.588133 0.0164444895 9121.5966
## 8 RE Agent Called Sales Rep 3075.819770 0.0006298636 15876.4536
## 9 Sales Rep Called RE Agent 7996.318503 0.0033160633 3015.6240
## 10 Unspecified 436.523892 0.0065315621 0.0000
## ROI
## 1 2.5393556
## 2 18.0358826
## 3 1.9075999
## 4 0.0976238
## 5 Inf
## 6 1.2075694
## 7 0.7674095
## 8 0.4723977
## 9 2.4870475
## 10 Inf
#Listing the results for the markov chain attribution for smarketing
results %>% select(channel_name, markov_model) %>%
mutate(conversion_rate = markov_model/Channel_Data$Total) %>%
mutate(CPA = Channel_Data$Total_Cost/markov_model) %>%
mutate(ROI = ((Channel_Data$NPV*markov_model) / Channel_Data$Total_Cost))
## channel_name markov_model conversion_rate CPA
## 1 Bing Search 103.421530 0.0415514383 1925.3244
## 2 Email 680.804275 0.0369942007 270.3126
## 3 Google Search 690.893664 0.0428806892 2332.0521
## 4 Meeting 3.035995 0.0030269145 99110.8282
## 5 Organic 3996.760609 0.0269810750 0.0000
## 6 Paid Other 199.994420 0.0281088432 3913.3592
## 7 Paid Social 346.830681 0.0270834516 5538.4374
## 8 RE Agent Called Sales Rep 3925.188787 0.0008037966 12440.9583
## 9 Sales Rep Called RE Agent 5748.776963 0.0023840106 4194.6122
## 10 Unspecified 739.293076 0.0110617970 0.0000
## ROI
## 1 3.63575085
## 2 29.59536055
## 3 3.00164824
## 4 0.07567286
## 5 Inf
## 6 1.78874457
## 7 1.26389441
## 8 0.60284745
## 9 1.78800796
## 10 Inf
results_ROI <- results %>%
mutate(first_ROI = (Channel_Data$NPV*first_touch) / Channel_Data$Total_Cost) %>%
mutate(last_ROI = (Channel_Data$NPV*last_touch) / Channel_Data$Total_Cost) %>%
mutate(linear_ROI = (Channel_Data$NPV*linear_touch) / Channel_Data$Total_Cost) %>%
mutate(markov_ROI = (Channel_Data$NPV*markov_model) / Channel_Data$Total_Cost) %>%
select(channel_name, first_ROI, last_ROI, linear_ROI, markov_ROI) %>%
filter(channel_name != "Organic" & channel_name != "Unspecified")
results_ROI
## channel_name first_ROI last_ROI linear_ROI markov_ROI
## 1 Bing Search 6.8903174 0.77340297 2.5393556 3.63575085
## 2 Email 37.2982666 9.86795631 18.0358826 29.59536055
## 3 Google Search 5.6479643 0.58651936 1.9075999 3.00164824
## 4 Meeting 0.0997009 0.07477567 0.0976238 0.07567286
## 5 Paid Other 3.3987095 0.37564684 1.2075694 1.78874457
## 6 Paid Social 2.4925816 0.23686813 0.7674095 1.26389441
## 7 RE Agent Called Sales Rep 0.2706156 0.56611180 0.4723977 0.60284745
## 8 Sales Rep Called RE Agent 1.2714664 2.99889400 2.4870475 1.78800796
#Transforms the dataset into a data frame that ggplot2 can use to graph the outcomes
results_ROI <- melt(results_ROI, id='channel_name')
# Plot the ROIs
ggplot(results_ROI, aes(x = reorder(channel_name, value), y = value, fill =variable)) +
geom_bar(stat='identity', position='dodge') +
ggtitle('ROI Per Channel') +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
coord_flip() +
theme_classic() +
scale_fill_manual(values = c("#08519c","#3182bd","#6baed6", "#bdd7e7")) +
theme(axis.text = element_text(face = "bold", size = 10),
axis.title = element_blank(),
axis.ticks.x = element_blank(),
legend.title = element_text(face = "bold"),
legend.position = c(0.9, 0.2),
plot.title = element_text(hjust=0.5, face ="bold"),
plot.subtitle = element_text(hjust = 0.5))
marketing_master <- master %>% filter(Channel != "Meeting" & Channel != "RE Agent Called Sales Rep" & Channel != "Sales Rep Called RE Agent")
# aggregating channels to the paths for each customer
paths2 <- marketing_master %>%
arrange(REAgentID, Date) %>%
group_by(REAgentID) %>%
summarise(path = paste(Channel, collapse = ' > '),
conv = mean(Conversion)) %>%
ungroup()
#Calculating the Markov model
markov <- markov_model(paths2,
var_path = 'path',
var_conv = 'conv',
out_more = TRUE)
#Calculating the other models
h_mod <- heuristic_models(paths2,
var_path = 'path',
var_conv = 'conv')
#Merges the two data frames on the "channel_name" column.
results2 <- merge(h_mod, markov$result, by='channel_name')
#Rename the columns
colnames(results2) <- c('channel_name', 'first_touch', 'last_touch', 'linear_touch', 'markov_model')
results2
## channel_name first_touch last_touch linear_touch markov_model
## 1 Bing Search 208 139 168.6668 191.7925
## 2 Email 1028 1011 1014.3919 1221.9004
## 3 Google Search 1339 908 1109.8847 1245.7677
## 4 Organic 7946 8299 8185.9571 7349.3587
## 5 Paid Other 386 279 325.9168 364.4154
## 6 Paid Social 698 431 542.7235 620.3554
## 7 Unspecified 736 1274 993.4594 1347.4099
#Transforms the dataset into a data frame that ggplot2 can use to graph the outcomes
results2_graph <- melt(results2, id='channel_name')
m_channel <- Channel_Data %>% filter(channel_name != "Meeting" & channel_name != "Sales Rep Called RE Agent" & channel_name != "RE Agent Called Sales Rep")
results2 %>% select(channel_name, last_touch) %>%
mutate(conversion_rate = last_touch/m_channel$Total) %>%
mutate(CPA = m_channel$Total_Cost/last_touch) %>%
mutate(ROI = (m_channel$NPV*last_touch) / m_channel$Total_Cost)
## channel_name last_touch conversion_rate CPA ROI
## 1 Bing Search 139 0.05584572 1432.5180 4.886501
## 2 Email 1011 0.05493670 182.0277 43.949356
## 3 Google Search 908 0.05635551 1774.4493 3.944886
## 4 Organic 8299 0.05602436 0.0000 Inf
## 5 Paid Other 279 0.03921293 2805.1971 2.495368
## 6 Paid Social 431 0.03365610 4456.8445 1.570618
## 7 Unspecified 1274 0.01906244 0.0000 Inf
results2 %>% select(channel_name, markov_model) %>%
mutate(conversion_rate = markov_model/m_channel$Total) %>%
mutate(CPA = m_channel$Total_Cost/markov_model) %>%
mutate(ROI = (m_channel$NPV*markov_model) / m_channel$Total_Cost)
## channel_name markov_model conversion_rate CPA ROI
## 1 Bing Search 191.7925 0.07705603 1038.2056 6.742402
## 2 Email 1221.9004 0.06639681 150.6097 53.117444
## 3 Google Search 1245.7677 0.07731925 1293.3390 5.412347
## 4 Organic 7349.3587 0.04961358 0.0000 Inf
## 5 Paid Other 364.4154 0.05121790 2147.6866 3.259321
## 6 Paid Social 620.3554 0.04844256 3096.4508 2.260653
## 7 Unspecified 1347.4099 0.02016085 0.0000 Inf
# Plot the total conversions
ggplot(results2_graph, aes(channel_name, value, fill = variable)) +
geom_bar(stat='identity', position='dodge') +
ggtitle('Total Conversions') +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
theme_classic() +
scale_fill_manual(values = c("#08519c","#3182bd","#6baed6", "#bdd7e7")) +
theme(axis.text = element_text(face = "bold", size = 10),
axis.title = element_blank(),
axis.ticks.x = element_blank(),
legend.title = element_text(face = "bold"),
legend.position = c(0.9, 0.8),
plot.title = element_text(hjust=0.5, face ="bold"),
plot.subtitle = element_text(hjust = 0.5))
# Construct conversions sequences for all visitors
master1 = master %>%
group_by(REAgentID) %>%
arrange(Date) %>%
mutate(order_seq = ifelse(Conversion > 0, 1, NA)) %>%
mutate(order_seq = lag(cumsum(ifelse(is.na(order_seq), 0, order_seq)))) %>%
mutate(order_seq = ifelse((row_number() == 1) & (Conversion > 0),
-1, ifelse(row_number() == 1, 0, order_seq))) %>%
ungroup()
# Create a modified channel stacks data frame
channel_stacks = master1 %>%
group_by(REAgentID, order_seq) %>%
#first remove irrelevant hits:
filter(!is.na(Channel) | Conversion>0) %>%
#next remove repeated values with a lag function:
filter((Channel != lag(Channel, default="1")) | Conversion>0) %>%
#now concatenate the sequence into a single row:
summarize(
path = paste(Channel[which(!is.na(Channel))], collapse=">"),
# for Spark SQL or PostgreSQL:
# path = concat_ws(" > ", collect_list(Channel))
Conversion = sum(Conversion)
) %>% ungroup() %>%
#next roll up each unique path by count and conversion:
group_by(path) %>%
summarize(
Conversion = sum(Conversion),
path_count = n()
) %>% ungroup() %>%
#last create a conversion rate column and pull it out of Spark:
mutate(
conversion_rate = Conversion/path_count
) %>%
filter(path != "") %>%
collect()
# Visualizing customer paths with a Sankey Diagram
# Creating a list of channels for convinience
channel_stacks$path_list = strsplit(x=channel_stacks$path,split=">")
# set the depth of the Sankey Diagram
depth = 7
#Generate node labels and label length vectors
node_labels=rep(list(list()),depth)
label_length = list()
for(i in 1:depth){
for(j in 1:length(channel_stacks$path)){
if(!is.na(channel_stacks$path_list[j][[1]][i]))
node_labels[[i]][j] = channel_stacks$path_list[j][[1]][i]
}
node_labels[[i]] = unique(unlist(node_labels[[i]]))
node_labels[[i]] = node_labels[[i]][order(node_labels[[i]])]
label_length[[i]] = length(node_labels[[i]])
}
node_labels = unlist(node_labels)
label_length = unlist(label_length)
# Build a data frame to fill out with each path view
combos = NULL
for(i in 1:(depth-1)){
for(j in (1 + sum(label_length[1:i-1])):(label_length[i] + sum(label_length[1:i-1]))){
for(k in (1 + label_length[i] + sum(label_length[1:i-1])):(label_length[i+1] + label_length[i] + sum(label_length[1:i-1]))){
combos = rbind(combos, c(i,j,k,0))
}
}
}
combos = as.data.frame(combos)
names(combos) = c("step","source","target","value")
#Populate the combo table
for(i in 1:(dim(combos)[1])){
for(j in 1:(dim(channel_stacks)[1])){
combos$value[i] = sum(combos$value[i], ifelse(
(node_labels[combos$source[i]] == channel_stacks$path_list[j][[1]][combos$step[i]]) &
(node_labels[combos$target[i]] == channel_stacks$path_list[j][[1]][combos$step[i]+1]),
channel_stacks$path_count[j],0), na.rm = TRUE)
}
}
#Add a node to populate with conversion values
uniques = unique(c(combos$source,combos$target))
converts = as.data.frame(list("step"=rep(0,length(uniques)), "source"=uniques, "target"=rep(max(uniques)+1,length(uniques)), "value"=rep(0,length(uniques))))
combos = rbind(combos,converts)
for(i in 1:(dim(channel_stacks)[1])){
stack_depth = min(depth,length(channel_stacks$path_list[i][[1]]))
index_val = which(combos$step==0 & combos$source==(which(node_labels == channel_stacks$path_list[i][[1]][stack_depth]) + ifelse(stack_depth>1, sum(label_length[1:(stack_depth-1)]),0)))
combos$value[index_val] = combos$value[index_val] + channel_stacks$Conversion[i]
}
#Populate the conversion node values
display_node_labels = node_labels
for(i in 1:length(label_length)){
for(j in 1:label_length[i]){
display_node_labels[j+ifelse(i==1,0,sum(label_length[1:(i-1)]))] = paste0(i,":",node_labels[j+ifelse(i==1,0,sum(label_length[1:(i-1)]))])
}
}
display_node_labels = c(display_node_labels, "Conversion")
#Generate Sankey diagram
p <- plot_ly(
type = "sankey",
orientation = "v",
node = list(
label = display_node_labels,
#color = node_colors,
pad = 10,
thickness = 30,
line = list(
color = "white",
width = 0
)
),
link = list(
source = combos$source-1, # convert to zero index
target = combos$target-1, # convert to zero index
value = combos$value, #size of connection
color = "rgba(0, 0, 0, 0.2)"
)
) %>%
layout(
title = "Conversion Flow Diagram",
font = list(
size = 10
)
)
p